## Replication Code for
## "Education and Social Capital"
## Apfeld, Coman, Gerring, and Jessee
## Journal of Experimental Political Science

## NOTE: This code requires the 
## following R packages to be installed 
## (version number used for paper in parentheses):
##     car (3.0-8)
##     mice (3.9.0)
##     rdrobust (0.99.9)
##     rdlocrand (0.7.1)
## users should be sure to install all of these 
## packages BEFORE running the code below 


library(car)
library(mice)
library(rdrobust)
library(rdlocrand)


## loading survey data
load("JEPS-replication-data.RData")

# creating recoded indicator variables
library(car) # to get recode()
# recoding vote indicators
# this function standardizes each item (also making it a vector)
scale.nomatrix <- function(x){as.vector(scale(x))}
# recoding voting indicators
DAT.WITH2019$iv1 <- scale.nomatrix(recode(DAT.WITH2019$q4, 
                                          '"Don\'t remember"=0; "No"=0; "Not old enough"=NA; "Yes"=1; else=NA', 
                                          as.numeric=TRUE))
DAT.WITH2019$iv2 <- scale.nomatrix(4 - DAT.WITH2019$q7_1)
# recoding other participation indicators
DAT.WITH2019$ip1 <- scale.nomatrix(DAT.WITH2019$q5)
DAT.WITH2019$ip2 <- scale.nomatrix(DAT.WITH2019$q6_5)
DAT.WITH2019$ip3 <- scale.nomatrix(4 - DAT.WITH2019$q7_2)
DAT.WITH2019$ip4 <- scale.nomatrix(4 - DAT.WITH2019$q7_3)
DAT.WITH2019$ip5 <- scale.nomatrix(4 - DAT.WITH2019$q7_4)
DAT.WITH2019$ip6 <- scale.nomatrix(4 - DAT.WITH2019$q7_5)
DAT.WITH2019$ip7 <- scale.nomatrix(4 - DAT.WITH2019$q7_6)
# recoding membership indicators
DAT.WITH2019$im1 <- scale.nomatrix(DAT.WITH2019$q6_2)
DAT.WITH2019$im2 <- scale.nomatrix(DAT.WITH2019$q6_3)
DAT.WITH2019$im3 <- scale.nomatrix(DAT.WITH2019$q6_4)
DAT.WITH2019$im4 <- scale.nomatrix(DAT.WITH2019$q6_6)
DAT.WITH2019$im5 <- scale.nomatrix(DAT.WITH2019$q6_7)
DAT.WITH2019$im6 <- scale.nomatrix(DAT.WITH2019$q6_8)
DAT.WITH2019$im7 <- scale.nomatrix(DAT.WITH2019$q6_9)
DAT.WITH2019$im8 <- scale.nomatrix(DAT.WITH2019$q6_10)
DAT.WITH2019$im9 <- scale.nomatrix(DAT.WITH2019$q6_11)
# recoding trust indicators
DAT.WITH2019$it1 <- scale.nomatrix(6-DAT.WITH2019$q2_1)
DAT.WITH2019$it2 <- scale.nomatrix(6-DAT.WITH2019$q2_2)


# calculating proportion of missing values
prop.missing.scap <- apply(DAT.WITH2019[, c("iv1", "iv2", "ip1", "ip2", "ip3", "ip4", 
                                            "ip5", "ip6", "ip7", "im1", "im2", "im3",
                                            "im4", "im5", "im6", "im7", "im8", "im9",
                                            "it1", "it2")],
                           1, function(x){mean(is.na(x))})

# loading mice library for imputation
library(mice)
indicator.imputation <- mice(DAT.WITH2019[prop.missing.scap < .5, c("iv1", "iv2", "ip1", "ip2", "ip3", "ip4", 
                                                                    "ip5", "ip6", "ip7", "im1", "im2", "im3",
                                                                    "im4", "im5", "im6", "im7", "im8", "im9",
                                                                    "it1", "it2")], 
                             m=1, maxit = 100, method = 'norm.predict', seed = 12345)
INDICATORS.IMP <- complete(indicator.imputation)

# calculating domain-specific means
DAT.WITH2019$ivbar <- DAT.WITH2019$ipbar <- DAT.WITH2019$imbar <- DAT.WITH2019$itbar <- DAT.WITH2019$scap <- NA
DAT.WITH2019$ivbar[prop.missing.scap < .5] <- (INDICATORS.IMP$iv1 + INDICATORS.IMP$iv2) / 2
DAT.WITH2019$ivbar <- (DAT.WITH2019$ivbar - mean(DAT.WITH2019$ivbar, na.rm=TRUE)) / sd(DAT.WITH2019$ivbar, na.rm=TRUE)
DAT.WITH2019$ipbar[prop.missing.scap < .5] <- (INDICATORS.IMP$ip1 + INDICATORS.IMP$ip2 + 
                                                 INDICATORS.IMP$ip3 + INDICATORS.IMP$ip4 +
                                                 INDICATORS.IMP$ip5 + INDICATORS.IMP$ip6 + 
                                                 INDICATORS.IMP$ip7) / 7
DAT.WITH2019$ipbar <- (DAT.WITH2019$ipbar - mean(DAT.WITH2019$ipbar, na.rm=TRUE)) / sd(DAT.WITH2019$ipbar, na.rm=TRUE)
DAT.WITH2019$imbar[prop.missing.scap < .5] <- (INDICATORS.IMP$im1 + INDICATORS.IMP$im2 + 
                                                 INDICATORS.IMP$im3 + INDICATORS.IMP$im4 + 
                                                 INDICATORS.IMP$im5 + INDICATORS.IMP$im6 + 
                                                 INDICATORS.IMP$im7 + INDICATORS.IMP$im8 + 
                                                 INDICATORS.IMP$im9) / 9
DAT.WITH2019$imbar <- (DAT.WITH2019$imbar - mean(DAT.WITH2019$imbar, na.rm=TRUE)) / sd(DAT.WITH2019$imbar, na.rm=TRUE)
DAT.WITH2019$itbar[prop.missing.scap < .5] <- (INDICATORS.IMP$it1 + INDICATORS.IMP$it2) / 2
DAT.WITH2019$itbar <- (DAT.WITH2019$itbar - mean(DAT.WITH2019$itbar, na.rm=TRUE)) / sd(DAT.WITH2019$itbar, na.rm=TRUE)
DAT.WITH2019$scap <- (DAT.WITH2019$ivbar + DAT.WITH2019$ipbar + DAT.WITH2019$imbar + DAT.WITH2019$itbar) / 4
DAT.WITH2019$scap <- (DAT.WITH2019$scap - mean(DAT.WITH2019$scap, na.rm=TRUE)) / sd(DAT.WITH2019$scap, na.rm=TRUE)

# recoding childhood SES recall question
DAT.WITH2019$childhoodSES <- recode(DAT.WITH2019$p22, "'Lower class'=1; 'Working class'=2; 'Lower middle class'=3; 'Upper middle class'=4; 'Upper class'=5; else=NA")
DAT.WITH2019$fatheredu <- DAT.WITH2019$p20

# 4 category father's education
DAT.WITH2019$fatheredu4 <- c(1,1,1,1,1,2,3,4,4,4)[DAT.WITH2019$fatheredu]

# subsetting data to drop GRADYEAR==2019
DAT <- subset(DAT.WITH2019, GRADYEAR!=2019)

## Assessing compliance
# How what percent of people who "passed" the Bac went to college? 
# ...and what percent who "failed" the Bac went to college?
prop.table(table(DAT$average >= 6, DAT$p6), margin=1)

####################################
## MAIN PAPER TABLES AND FIGURES ##
####################################


##############
## Figure 1 ##
##############
## [figure 1 is produced with other code 
##  since it uses different a dataset, see readme file]

##############
## Figure 2 ##
##############
## [figure 2 is produced with other code 
##  since it uses different a dataset, see readme file]

##############
## Figure 3 ##
##############

par(mfrow=c(1,2))
plot(tapply(DAT$p6, DAT$average, mean, na.rm=TRUE) ~ sort(unique(DAT$average)),
     cex=table(DAT$average)/15, pch=20, bty="n",
     xlab="Bac Score", ylab="Proportion attending university")
abline(v=6, lty=3, col="red")
abline(h=tapply(DAT$p6, DAT$average>=6, mean, na.rm=TRUE), lty=3, col="gray")
plot(tapply(DAT$scap, DAT$average, mean, na.rm=TRUE) ~ sort(unique(DAT$average)), 
     cex=table(DAT$average)/15,
     pch=20, bty="n", 
     xlab="Bac Score", ylab="Average Social Capital")
abline(v=6, lty=3, col="red")
abline(h=tapply(DAT$scap, DAT$average>=6, mean, na.rm=TRUE), lty=3, col="gray")


#############
## Table 1 ##
#############

clustervar <- DAT$average
rd.main <- rdrobust(y=DAT$scap, x=DAT$average, 
                    c=6, h=.2, fuzzy=DAT$p6,
                    vce = "hc0", cluster = clustervar)
summary(rd.main)


##############
## Figure 4 ##
##############

# creating table to hold graduation year specific estimates
year.est.table <- matrix(NA, nrow=5, ncol=3)
rownames(year.est.table) <- 2015:2019
colnames(year.est.table) <- c("tau_hat", "ci_lower", "ci_upper")
# looping over graduation years in our sample (uncluding 2019)
for(i in 1:5){
  subyear <- (2015:2019)[i]
  # conducting same RD analysis as our main result
  # using only gradyear-specific data
  foo <- rdrobust(y=DAT.WITH2019$scap[DAT.WITH2019$GRADYEAR==subyear],
                  x=DAT$average[DAT.WITH2019$GRADYEAR==subyear], 
                  c=6, h=.2, fuzzy=DAT$p6[DAT.WITH2019$GRADYEAR==subyear],
                  vce = "hc0", cluster = DAT.WITH2019$average[DAT.WITH2019$GRADYEAR==subyear])
  # storing estimated coef and CI
  year.est.table[i, ] <- c(foo$coef[1,], foo$ci[3,])
  # note: using "standard" estimate and "robust" CI
}
# creating plot for Figure 3
plot(2015:2019, year.est.table[,1], 
     xlab="Year of Graduation", xaxt="n",
     ylab="Estimated effect of university attendance",
     bty="n", ylim=range(year.est.table), pch=20, col="red")
axis(1, at=2015:2019, labels=2015:2019)
for(i in 1:5){
  segments((2015:2019)[i], year.est.table[i,2], (2015:2019)[i], year.est.table[i,3])
}
abline(h=0, col="gray", lty=3)



##############
## Figure 5 ##
##############

# left pane (childhood SES)
# creating table to hold SES-specific estimates
SES.est.table <- matrix(NA, nrow=5, ncol=3)
rownames(SES.est.table) <- 1:5
colnames(SES.est.table) <- c("tau_hat", "ci_lower", "ci_upper")
# looping over SES values (1 to 5)
for(i in 1:5){
  # conducting same RD analysis as our main result
  # using only SES-specific data
  foo <- rdrobust(y=DAT$scap[DAT$childhoodSES==i],
                  x=DAT$average[DAT$childhoodSES==i], 
                  c=6, h=.2, fuzzy=DAT$p6[DAT$childhoodSES==i],
                  vce = "hc0", cluster = DAT$average[DAT$childhoodSES==i])
  SES.est.table[i, ] <- c(foo$coef[1,], foo$ci[3,])
}
# creating plot
plot(1:5, SES.est.table[,1], 
     xlab="Self-Reported Childhood SES", ylab="Estimated effect of university attendance",
     bty="n", ylim=range(SES.est.table), pch=20, col="red")
for(i in 1:5){
  segments(i, SES.est.table[i,2], i, SES.est.table[i,3])
}
abline(h=0, col="gray", lty=3)

# right pane (father's education)
# creating 4cat father's edu variable
fatheredu.est.table <- matrix(NA, nrow=4, ncol=3)
rownames(fatheredu.est.table) <- 1:4
colnames(fatheredu.est.table) <- c("tau_hat", "ci_lower", "ci_upper")
# looping over values of father's education (1 to 4)
for(i in 1:4){
  # conducting same RD analysis as our main result
  # using only father's education-specific data
  foo <- rdrobust(y=DAT$scap[DAT$fatheredu4==i],
                  x=DAT$average[DAT$fatheredu4==i], 
                  c=6, h=.2, fuzzy=DAT$p6[DAT$fatheredu4==i],
                  vce = "hc0", cluster = DAT$average[DAT$fatheredu4==i])
  fatheredu.est.table[i, ] <- c(foo$coef[1,], foo$ci[3,])
}
# making figure
plot(1:4, fatheredu.est.table[,1], 
     xlab="Father's Education (4 Category)", ylab="Estimated effect of university attendance",
     bty="n", ylim=range(fatheredu.est.table), 
     xaxt="n",pch=20, col="red")
axis(1, at=1:4, labels=1:4)
for(i in 1:4){
  segments(i, fatheredu.est.table[i,2], i, fatheredu.est.table[i,3])
}
abline(h=0, col="gray", lty=3)


#############
## Table 2 ##
#############
cor(cbind(DAT$ivbar, DAT$itbar, DAT$imbar, DAT$ipbar),
    use="p")


##############
## Figure 6 ##
##############
par(mfrow=c(2,2))
hist(DAT$ivbar, main="", xlab="Voting", 
     ylab="", yaxt="n")
hist(DAT$itbar, main="", xlab="Trust", 
     ylab="", yaxt="n")
hist(DAT$imbar, main="", xlab="Membership in Organizations", 
     ylab="", yaxt="n")
hist(DAT$ipbar, main="", xlab="Other Political Participation", 
     ylab="", yaxt="n")



#############
## Table 3 ##
#############

## Voting
clustervar <- DAT$average
rd.voting <- rdrobust(y=DAT$ivbar, x=DAT$average, 
                      c=6, h=.2, fuzzy=DAT$p6,
                      vce = "hc0", cluster = clustervar)
summary(rd.voting)
## Trust
clustervar <- DAT$average
rd.trust <- rdrobust(y=DAT$itbar, x=DAT$average, 
                     c=6, h=.2, fuzzy=DAT$p6,
                     vce = "hc0", cluster = clustervar)
summary(rd.trust)
## Membership
clustervar <- DAT$average
rd.membership <- rdrobust(y=DAT$imbar, x=DAT$average, 
                          c=6, h=.2, fuzzy=DAT$p6,
                          vce = "hc0", cluster = clustervar)
summary(rd.membership)
## Other Political Participation
clustervar <- DAT$average
rd.participation <- rdrobust(y=DAT$ipbar, x=DAT$average, 
                             c=6, h=.2, fuzzy=DAT$p6,
                             vce = "hc0", cluster = clustervar)
summary(rd.participation)


## EU VOTE RD (referenced in text)
clustervar <- DAT$average
DAT$EUvote <- recode(DAT$q4, 
                     '"Don\'t remember"=0; "No"=0; "Not old enough"=NA; "Yes"=1; else=NA', 
                     as.numeric=TRUE)
rd.EUvote <- rdrobust(y=DAT$EUvote, x=DAT$average, 
                      c=6, h=.2, fuzzy=DAT$p6,
                      vce = "hc0", cluster = clustervar)
summary(rd.EUvote)

## Individual Trust Question Analyses
## (referenced in text)
clustervar <- DAT$average
# general trust in people
rd.trust.people <- rdrobust(y=6-DAT$q2_1, x=DAT$average, 
                            c=6, h=.2, fuzzy=DAT$p6,
                            vce = "hc0", cluster = clustervar)
summary(rd.trust.people)
# trust in political leaders
rd.trust.leaders <- rdrobust(y=6-DAT$q2_2, x=DAT$average, 
                             c=6, h=.2, fuzzy=DAT$p6,
                             vce = "hc0", cluster = clustervar)
summary(rd.trust.leaders)


#######################################
## APPENDIX PAPER TABLES AND FIGURES ##
#######################################

##############
## Table B1 ##
##############
# gender
prop.table(table(DAT.WITH2019$p11))
# single (never married)
prop.table(table(DAT.WITH2019$p12=="Single"))
# one or more children
prop.table(table(DAT.WITH2019$p13>=1))
# employed
prop.table(table(DAT.WITH2019$p14_1=="Yes"))
# Romanian ethnicity
prop.table(table(DAT.WITH2019$p23[DAT.WITH2019$p23!=""]=="român"))
# father's education level (4 cat)
prop.table(table(DAT.WITH2019$fatheredu4))
# childhood SES
prop.table(table(DAT.WITH2019$childhoodSES))
# current SES
prop.table(table(DAT.WITH2019$p15[DAT.WITH2019$p15!=""]))
# age
prop.table(table(2020-DAT.WITH2019$p17))


###############
## Figure C1 ##
###############
# vector of bandwidths to use
bandwidth.radius <- c(.075,.1, .125,.15, .175,.2)
# creating table to store estimates, CIs
bandwidth.est.table <- matrix(NA, nrow=length(bandwidth.radius), ncol=3)
rownames(bandwidth.est.table) <- bandwidth.radius
colnames(bandwidth.est.table) <- c("tau_hat", "ci_lower", "ci_upper")
# looping over each bandwidth value
for(i in 1:length(bandwidth.radius)){
  foo <- rdrobust(y=DAT$scap,
                  x=DAT$average, 
                  c=6, h=bandwidth.radius[i], 
                  fuzzy=DAT$p6,
                  vce = "hc0", 
                  cluster = DAT$average)
  bandwidth.est.table[i, ] <- c(foo$coef[1,], foo$ci[3,])
}
# making plot of results
plot(bandwidth.radius, bandwidth.est.table[,1], 
     xlab="Bandwidth Radius", 
     ylab="Estimated effect of university attendance",
     bty="n", ylim=c(-1,2), 
     pch=20, col="red", xaxt="n")
axis(1, at=bandwidth.radius, labels=substr(bandwidth.radius, 2, 5), las=2)
for(i in 1:length(bandwidth.radius)){
  segments(bandwidth.radius[i], bandwidth.est.table[i,2], 
           bandwidth.radius[i], bandwidth.est.table[i,3])
}
abline(h=0, col="gray", lty=3)


###############
## Figure C2 ##
###############
# vector of donut hole radius
donut.radius <- c(0, .02, .04, .06, .08, .1)
# matrix to store estimates and CIs
donut.est.table <- matrix(NA, nrow=length(donut.radius), ncol=3)
rownames(donut.est.table) <- donut.radius
colnames(donut.est.table) <- c("tau_hat", "ci_lower", "ci_upper")
# looping over donut radius values
for(i in 1:length(donut.radius)){
  foo <- rdrobust(y=DAT$scap[DAT$average < 6 - donut.radius[i] |
                               DAT$average >= 6 + donut.radius[i]],
                  x=DAT$average[DAT$average < 6 - donut.radius[i] | 
                                  DAT$average >= 6 + donut.radius[i]], 
                  c=6, h=.2, 
                  fuzzy=DAT$p6[DAT$average < 6 - donut.radius[i] | 
                                 DAT$average >= 6 + donut.radius[i]],
                  vce = "hc0", 
                  cluster = DAT$average[DAT$average < 6 - donut.radius[i] | 
                                          DAT$average >= 6 + donut.radius[i]])
  donut.est.table[i, ] <- c(foo$coef[1,], foo$ci[3,])
}
# making plot of estimates
plot(donut.radius, donut.est.table[,1], 
     xlab="Donut Hole Radius", ylab="Estimated effect of university attendance",
     bty="n", ylim=range(donut.est.table), pch=20, col="red")
for(i in 1:length(donut.radius)){
  segments(donut.radius[i], donut.est.table[i,2], 
           donut.radius[i], donut.est.table[i,3])
}
abline(h=0, col="gray", lty=3)

###############
## Figure C3 ##
###############
plot(tapply(DAT$fatheredu, DAT$average, mean, na.rm=TRUE) ~ sort(unique(DAT$average)), 
     cex=table(DAT$average)/15,
     pch=20, bty="n", 
     xlab="Bac Score", ylab="Father's Education")
abline(v=6, lty=3, col="red")


##############
## Table C1 ##
##############
clustervar <- DAT$average
rd.fatheredu <- rdrobust(y=DAT$fatheredu, x=DAT$average, 
                         c=6, h=.2, fuzzy=DAT$p6,
                         vce = "hc0", cluster = clustervar)
summary(rd.fatheredu)


###############
## Figure C4 ##
###############
plot(tapply(DAT$childhoodSES, DAT$average, mean, na.rm=TRUE) ~ sort(unique(DAT$average)), 
     cex=table(DAT$average)/15,
     pch=20, bty="n", 
     xlab="Bac Score", ylab="Childhood Socio-Economic Status")
abline(v=6, lty=3, col="red")


##############
## Table C2 ##
##############
clustervar <- DAT$average
rd.childhoodSES <- rdrobust(y=DAT$childhoodSES, x=DAT$average, 
                            c=6, h=.2, fuzzy=DAT$p6,
                            vce = "hc0", cluster = clustervar)
summary(rd.childhoodSES)


##############
## Table C3 ##
##############
# Note: "T" is estimated treatment effect
# minimum bandwidth (only closest values to threshold)
rdloc.closest <- rdrandinf(Y=DAT$scap,
                           R=DAT$average, 
                           cutoff=6, 
                           wl=6 - .02, 
                           wr=6 + .001, 
                           fuzzy=DAT$p6)
# table of local randomization effect estimates
# using different bandwidths
# note: .2 bandwidth uses full sample
bandwidth.radius <- c(.075,.1, .125,.15, .175,.2)
locrand.bw.table <- matrix(NA, nrow=length(bandwidth.radius), ncol=2)
row.names(locrand.bw.table) <- bandwidth.radius
colnames(locrand.bw.table) <- c("est", "p-value")
for(i in 1:length(bandwidth.radius)){
  foo <- rdrandinf(Y=DAT$scap,
                   R=DAT$average, 
                   cutoff=6, 
                   wl=6 - bandwidth.radius[i], 
                   wr=6 + bandwidth.radius[i], 
                   fuzzy=DAT$p6)
  locrand.bw.table[i, 1] <- foo$obs.stat
  locrand.bw.table[i, 2] <- foo$p.value
}
## printing bandwidth-specific estimates and CIs
locrand.bw.table


##############
## Table C4 ##
##############
# father's education
rd.locrand.fatheredu <- rdrandinf(Y=DAT$fatheredu[!is.na(DAT$fatheredu) & 
                                                    !is.na(DAT$average) & 
                                                    !is.na(DAT$p6)],
                                  R=DAT$average[!is.na(DAT$fatheredu) & 
                                                  !is.na(DAT$average) & 
                                                  !is.na(DAT$p6)],
                                  cutoff=6, wl=5.8, wr=6.2, 
                                  fuzzy=DAT$p6[!is.na(DAT$fatheredu) & 
                                                 !is.na(DAT$average) & 
                                                 !is.na(DAT$p6)], 
                                  ci=c(.05, seq(-1,1, by=.01)))
# childhood SES
rd.locrand.childhoodSES <- rdrandinf(Y=DAT$childhoodSES[!is.na(DAT$childhoodSES) & 
                                                          !is.na(DAT$average) & 
                                                          !is.na(DAT$p6)],
                                     R=DAT$average[!is.na(DAT$childhoodSES) & 
                                                     !is.na(DAT$average) & 
                                                     !is.na(DAT$p6)],
                                     cutoff=6, wl=5.8, wr=6.2, 
                                     fuzzy=DAT$p6[!is.na(DAT$childhoodSES) & 
                                                    !is.na(DAT$average) & 
                                                    !is.na(DAT$p6)], 
                                     ci=c(.05, seq(-1,1, by=.01)))



## Automated bandwidth selection for local randomization analysis
## (this is discussed in the appendix text, not a table or figure)
rdwinselect(R=DAT$average, 
            X=cbind(DAT$fatheredu, DAT$childhoodSES),
            cutoff=6)
# relevant output for this analysis is: 
#  "Recommended window is [5.9;6.1]"
